# Calculating emission rates for each pollutant using OGE data
library(pacman)
p_load(
  here, data.table, fst, ggplot2, readxl, magrittr, 
  fixest, broom, dplyr, purrr, stringr, janitor
)
# TODO: 
calc_avg_emissions_rates = function(){
# Loading data ----------------------------------------------------------------
  print('Loading data')
  # Unit information table 
  oge_plant_info_dt =
    read.fst(
      path = here("Data/electricity-generation/plant-info-dt-oge.fst"),
      as.data.table = TRUE
    )
  # generation by unit in raw CEMS data
  unit_weight_dt = 
    map_dfr(
      2019:2020,
      \(yr){
        read.fst(
          here(paste0("Data/electricity-generation/cems/raw_cems_",yr,".fst")),
          as.data.table = TRUE
        )[,year := yr][,
          .(tot_heat_input_mm_btu = sum(heat_input_mm_btu, na.rm = TRUE)), 
          keyby = .(orispl_code, unitid, year)
        ]
      }
    )[,
      .(tot_heat_input_mm_btu = sum(tot_heat_input_mm_btu, na.rm = TRUE)), 
      keyby = .(orispl_code, unitid)
    ]
  invisible(gc())
  # Reading in egrid PM2.5 emissions rates
  egrid_unit_pm_dt = 
    read_xlsx(
      path = here('Data/electricity-generation/egrid/egrid_draft_pm2.5_emissions_7-20-20.xlsx'),
      sheet = '2018 PM Unit-level Data',
      skip = 1
    ) |> 
    data.table() %>% 
    # Two plants have duplicates because of multiple prime mover types
    # Taking the max of the two
    .[PRMVR != 'HY',
      # PM25RT is in lbs/mmbtu, converting to tons
      .(pm_tons_per_mmbtu = max(PM25RT/2000, na.rm = TRUE)),
      keyby = .(orispl_code = ORISPL, unitid = UNITID)
    ] %>% .[!is.nan(pm_tons_per_mmbtu) & !is.infinite(pm_tons_per_mmbtu)] 
  # Hourly emissions data
  elec_gen_dt =
    read.fst(
      path = here("Data/electricity-generation/plant-model-fit/elec-gen-fit-dt.fst"),
      as.data.table = TRUE
    ) |> setkey(plant_id_eia, datetime_utc)
  # Shaped plant info table 
  shaped_plant_info_dt =
    read.fst(
      path = here("Data/electricity-generation/shaped-plant-info-dt-oge.fst"),
      as.data.table = TRUE
    )[shaped_plant_id %in% unique(elec_gen_dt$plant_id_eia)]
  # Monthly data 
  monthly_plant_dt = 
    rbind(
      fread(here(
        'Data/electricity-generation/open-grid-emissions/montly-plant-data/plant_data_2019.csv'
      ))[,year:=2019],
      fread(here(
        'Data/electricity-generation/open-grid-emissions/montly-plant-data/plant_data_2020.csv'
      ))[,year:=2020]
    )
  
# Calculating annual average emissions ----------------------------------------
  print('calculating rates')
  # Converting unit PM rates to plant PM rates 
  egrid_plant_pm_dt = 
    merge(
      egrid_unit_pm_dt,
      unit_weight_dt,
      by = c('orispl_code','unitid'),
      all.x = TRUE
    ) %>%
    .[,':='(
      n = .N,
      wt = ifelse(is.na(tot_heat_input_mm_btu),0,tot_heat_input_mm_btu)/
        sum(tot_heat_input_mm_btu, na.rm =TRUE)), 
      by = .(orispl_code)
    ] %>% 
    .[,wt := ifelse(is.nan(wt), 1/n, wt)] %>% 
    .[,.(
      pm_tons_per_mmbtu = weighted.mean(pm_tons_per_mmbtu, w = wt, na.rm = TRUE)),
      keyby = .(plant_id_eia = orispl_code)
    ]
  emissions_dt_raw = 
    rbind(
      # Here for the non-shaped plants
      elec_gen_dt[plant_id_eia < 900000,.(
        tot_co2e_tons_for_electricity = sum(co2e_mass_lb_for_electricity, na.rm = TRUE)/2000,
        tot_nox_tons_for_electricity = sum(nox_mass_lb_for_electricity, na.rm = TRUE)/2000,
        tot_so2_tons_for_electricity = sum(so2_mass_lb_for_electricity, na.rm = TRUE)/2000,
        tot_net_generation_mwh = sum(net_generation_mwh, na.rm = TRUE),
        tot_fuel_consumed_for_electricity_mmbtu = 
          sum(fuel_consumed_for_electricity_mmbtu, na.rm = TRUE)), 
        keyby = .(plant_id_eia)
      ][,shaped := FALSE],
      # Here for the shaped plants 
      merge(
        shaped_plant_info_dt,
        monthly_plant_dt, 
        by = c('plant_id_eia','year')
      )[!(plant_id_eia %in% unique(elec_gen_dt$plant_id_eia)),.(
        tot_co2e_tons_for_electricity = sum(co2e_mass_lb_for_electricity, na.rm = TRUE)/2000,
        tot_nox_tons_for_electricity = sum(nox_mass_lb_for_electricity, na.rm = TRUE)/2000,
        tot_so2_tons_for_electricity = sum(so2_mass_lb_for_electricity, na.rm = TRUE)/2000,
        tot_net_generation_mwh = sum(net_generation_mwh, na.rm = TRUE),
        tot_fuel_consumed_for_electricity_mmbtu = 
          sum(fuel_consumed_for_electricity_mmbtu, na.rm = TRUE)), 
        keyby = .(plant_id_eia)
      ][,shaped := TRUE]
    )
  # Calculating emissions rates for each pollutant
  emissions_dt = 
    merge(
      emissions_dt_raw,
      egrid_plant_pm_dt,
      by = c('plant_id_eia'),
      all.x = TRUE
    ) %>% 
    .[,.(
      plant_id_eia,  
      shaped,
      co2e_tons_per_mwh = ifelse(
        tot_net_generation_mwh == 0, 0,
        tot_co2e_tons_for_electricity/tot_net_generation_mwh
      ), 
      nox_tons_per_mwh = ifelse(
        tot_net_generation_mwh == 0, 0,
        tot_nox_tons_for_electricity/tot_net_generation_mwh
      ), 
      so2_tons_per_mwh = ifelse(
        tot_net_generation_mwh == 0, 0,
        tot_so2_tons_for_electricity/tot_net_generation_mwh
      ), 
      pm25_tons_per_mwh = ifelse(
        tot_net_generation_mwh == 0, 0,
        pm_tons_per_mmbtu*tot_fuel_consumed_for_electricity_mmbtu/
          tot_net_generation_mwh
      ),
      tot_net_generation_mwh
    )]
  # Currently have PM2.5 emissions rates for 90% of electricity generation
  print(emissions_dt[, .(
    missing_pm25_rate = 
      sum(ifelse(is.na(pm25_tons_per_mwh), tot_net_generation_mwh, 0))/
      sum(tot_net_generation_mwh))
  ])
  # Calulating 95th percentile and median emissions rates by fuel type 
  pm25_quantile_dt = 
    merge(
      emissions_dt, 
      oge_plant_info_dt[,.SD[which.max(year)], by= plant_id_eia],
      by = 'plant_id_eia'
      # One plant is wind in 2020 but natural gas in 2019. No other shaped wind 
      # plants so switching it to natural gas
    )[,fuel_category := ifelse(plant_id_eia == 62736, 'natural_gas', fuel_category)
    ][,':='(
      pm25_tons_per_mwh_p50 = quantile(pm25_tons_per_mwh, probs = 0.5, na.rm = TRUE),
      pm25_tons_per_mwh_p95 = quantile(pm25_tons_per_mwh, probs = 0.95, na.rm = TRUE),
      co2e_tons_per_mwh_p95 = quantile(co2e_tons_per_mwh, probs = 0.95, na.rm = TRUE),
      so2_tons_per_mwh_p95 = quantile(so2_tons_per_mwh, probs = 0.95, na.rm = TRUE),
      nox_tons_per_mwh_p95 = quantile(nox_tons_per_mwh, probs = 0.95, na.rm = TRUE)),
      by = .(fuel_category, shaped)
    ][,.(
      plant_id_eia, 
      pm25_tons_per_mwh_p50, 
      pm25_tons_per_mwh_p95,
      co2e_tons_per_mwh_p95,
      so2_tons_per_mwh_p95,
      nox_tons_per_mwh_p95
    )]
  # Setting missing values to median and censoring high values
  emissions_dt = 
    merge(
      emissions_dt, 
      pm25_quantile_dt,
      by = 'plant_id_eia'
    )[,':='(
      pm25_tons_per_mwh = fcase(
        is.na(pm25_tons_per_mwh), pm25_tons_per_mwh_p50, 
        pm25_tons_per_mwh <= pm25_tons_per_mwh_p95, pm25_tons_per_mwh,
        pm25_tons_per_mwh > pm25_tons_per_mwh_p95, pm25_tons_per_mwh_p95
      ),
      co2e_tons_per_mwh = ifelse(
        co2e_tons_per_mwh > co2e_tons_per_mwh_p95, co2e_tons_per_mwh_p95,
        co2e_tons_per_mwh
      ),
      so2_tons_per_mwh = ifelse(
        so2_tons_per_mwh > so2_tons_per_mwh_p95, so2_tons_per_mwh_p95,
        so2_tons_per_mwh
      ),
      nox_tons_per_mwh = ifelse(
        nox_tons_per_mwh > nox_tons_per_mwh_p95, nox_tons_per_mwh_p95,
        nox_tons_per_mwh
      ),
      pm25_tons_per_mwh_p50 = NULL,
      pm25_tons_per_mwh_p95 = NULL,
      co2e_tons_per_mwh_p95 = NULL,
      so2_tons_per_mwh_p95 = NULL,
      nox_tons_per_mwh_p95 = NULL
    )]
  # Saving the results 
  fwrite(
    emissions_dt,
    file = here('Data/electricity-generation/output/emissions-rate-avg-oge.csv')
  )
  print('done.')
}

# Fitting model with different emissions rates each tercile
# For Testing on a single unit:
#plant_id_eia_in = 3

emissions_rates_median = function(plant_id_eia_in, elec_gen_dt, split = 0.5, print = FALSE){
  if(print == TRUE) {print(plant_id_eia_in)}
  # Filtering to single unit, removing hours with no electricity but positive emissions
  plant_dt = elec_gen_dt[.(plant_id_eia_in)][
    !(net_generation_mwh == 0 & (
      nox_mass_lb_for_electricity > 0
      |so2_mass_lb_for_electricity > 0
      |co2e_mass_lb_for_electricity > 0
    ))
  ]
  # Finding quantile of generation
  plant_quants = quantile(plant_dt[net_generation_mwh>0]$net_generation_mwh, probs = split)
  plant_min = min(plant_dt$net_generation_mwh)
  # Adding indicators 
  plant_dt[, ':='(
    net_gen_over_split = net_generation_mwh >= plant_quants[1] & net_generation_mwh > plant_min,
    net_gen_msplit = net_generation_mwh - plant_quants[1]
  )]
  # Regression with split
  median_mod = feols(
    data = plant_dt,
    fml = c(
      nox_mass_lb_for_electricity, 
      so2_mass_lb_for_electricity, 
      co2e_mass_lb_for_electricity
      ) ~ -1 + net_generation_mwh + i(net_gen_over_split, net_gen_msplit, ref = TRUE)
  )
  # NOX: Re-estimating if marginal rate is negative
  if(length(coef(median_mod[lhs = 'nox'])) == 2){
    if(
      (coef(median_mod[lhs = 'nox'], keep = 'net_generation_mwh') + 
       coef(median_mod[lhs = 'nox'], keep = 'net_gen_over_split') < 0
      )|coef(median_mod[lhs = 'nox'], keep = 'net_generation_mwh') < 0
    ){# Fitting model without median indicator
      rate_mod_nox = feols(
        data = plant_dt, 
        fml = nox_mass_lb_for_electricity ~ -1 + net_generation_mwh 
      )
      # Collecting the results 
      median_out_nox = 
        rate_mod_nox |> 
        tidy() |> 
        mutate(
          pollutant = 'nox',
          plant_id_eia = plant_id_eia_in,
          split = split,
          net_generation_mwh_split = plant_quants[1]
        )
    }else{
      median_out_nox = 
        median_mod[lhs = 'nox'] |> 
        tidy() |> 
        mutate(
          pollutant = 'nox',
          plant_id_eia = plant_id_eia_in,
          split = split,
          net_generation_mwh_split = plant_quants[1]
        )
    }
  } else if(length(coef(median_mod[lhs = 'nox'])) == 1) {
    median_out_nox = 
      median_mod[lhs = 'nox'] |> 
      tidy() |> 
      mutate(
        pollutant = 'nox',
        plant_id_eia = plant_id_eia_in,
        split = split,
        net_generation_mwh_split = plant_quants[1]
      )
  } else {
    median_out_nox = tibble(
      term = NA,
      estimate = NA, 
      std.error = NA,
      statistic = NA, 
      p.value = NA,
      pollutant = 'nox',
      plant_id_eia = plant_id_eia_in,
      split = split,
      net_generation_mwh_split = plant_quants[1]
    )
  }
  # SO2: Re-estimating if marginal rate is negative
  if(length(coef(median_mod[lhs = 'so2'])) == 2){
    if(
      (coef(median_mod[lhs = 'so2'], keep = 'net_generation_mwh') + 
       coef(median_mod[lhs = 'so2'], keep = 'net_gen_over_split') < 0
      )|coef(median_mod[lhs = 'so2'], keep = 'net_generation_mwh') < 0
    ){# Fitting model without median indicator
      rate_mod_so2 = feols(
        data = plant_dt, 
        fml = so2_mass_lb_for_electricity ~ -1 + net_generation_mwh 
      )
      # Collecting the results 
      median_out_so2 = 
        rate_mod_so2 |> 
        tidy() |> 
        mutate(
          pollutant = 'so2',
          plant_id_eia = plant_id_eia_in,
          split = split,
          net_generation_mwh_split = plant_quants[1]
        )
    }else{
      median_out_so2 = 
        median_mod[lhs = 'so2'] |> 
        tidy() |> 
        mutate(
          pollutant = 'so2',
          plant_id_eia = plant_id_eia_in,
          split = split,
          net_generation_mwh_split = plant_quants[1]
        )
    }
  } else if(length(coef(median_mod[lhs = 'so2'])) == 1) {
    median_out_so2 = 
      median_mod[lhs = 'so2'] |> 
      tidy() |> 
      mutate(
        pollutant = 'so2',
        plant_id_eia = plant_id_eia_in,
        split = split,
        net_generation_mwh_split = plant_quants[1]
      )
  } else {
    median_out_so2 = tibble(
      term = NA,
      estimate = NA, 
      std.error = NA,
      statistic = NA, 
      p.value = NA,
      pollutant = 'so2',
      plant_id_eia = plant_id_eia_in,
      split = split,
      net_generation_mwh_split = plant_quants[1]
    )
  }
  # co2e: Re-estimating if marginal rate is negative
  if(length(coef(median_mod[lhs = 'co2e'])) == 2){
    if(
      (coef(median_mod[lhs = 'co2e'], keep = 'net_generation_mwh') + 
       coef(median_mod[lhs = 'co2e'], keep = 'net_gen_over_split') < 0
      ) | coef(median_mod[lhs = 'co2e'], keep = 'net_generation_mwh') < 0
    ){# Fitting model without median indicator
      rate_mod_co2e = feols(
        data = plant_dt, 
        fml = co2e_mass_lb_for_electricity ~ -1 + net_generation_mwh 
      )
      # Collecting the results 
      median_out_co2e = 
        rate_mod_co2e |> 
        tidy() |> 
        mutate(
          pollutant = 'co2e',
          plant_id_eia = plant_id_eia_in,
          split = split,
          net_generation_mwh_split = plant_quants[1]
        )
    }else{
      median_out_co2e = 
        median_mod[lhs = 'co2e'] |> 
        tidy() |> 
        mutate(
          pollutant = 'co2e',
          plant_id_eia = plant_id_eia_in,
          split = split,
          net_generation_mwh_split = plant_quants[1]
        )
    }
  } else if(length(coef(median_mod[lhs = 'co2e'])) == 1) {
    median_out_co2e = 
      median_mod[lhs = 'co2e'] |> 
      tidy() |> 
      mutate(
        pollutant = 'co2e',
        plant_id_eia = plant_id_eia_in,
        split = split,
        net_generation_mwh_split = plant_quants[1]
      )
  }else{
    median_out_co2e = tibble(
      term = NA,
      estimate = NA, 
      std.error = NA,
      statistic = NA, 
      p.value = NA,
      pollutant = 'co2e',
      plant_id_eia = plant_id_eia_in,
      split = split,
      net_generation_mwh_split = plant_quants[1]
    )
  }
  # Returning the results 
  return(
    rbind(median_out_nox, median_out_so2, median_out_co2e) %>% as.data.table() 
  )
}  

estimate_marginal_emissions_rate = function(){
  print('loading data')
  # Hourly emissions data
  elec_gen_dt =
    read.fst(
      path = here("Data/electricity-generation/plant-model-fit/elec-gen-fit-dt.fst"),
      as.data.table = TRUE
    ) |> setkey(plant_id_eia, datetime_utc)
  # Running models for all units (not shaped plants)
  emissions_rate_dt = 
    map_dfr(
      unique(elec_gen_dt[plant_id_eia < 900000]$plant_id_eia),
      emissions_rates_median,
      elec_gen_dt = elec_gen_dt,
      print = TRUE
    )
   # Cleaning up the results
  clean_emissions_rate_dt = 
    rbind(
      emissions_rate_dt[
        term == 'net_generation_mwh',.(
          plant_id_eia, pollutant, 
          output = 'emissions_rate_low',
          net_generation_mwh_split,
          emissions_rate = estimate
        )],
      emissions_rate_dt[!is.na(term)] %>% 
        dcast(
          plant_id_eia + pollutant + net_generation_mwh_split ~ term, 
          value.var = 'estimate', 
          fill= 0
        ) %>% 
        .[,.(
          plant_id_eia, pollutant, 
          output = 'emissions_rate_high',
          net_generation_mwh_split,
          emissions_rate = net_generation_mwh + `net_gen_over_split::TRUE:net_gen_msplit`
        )]
    ) 
  # Saving results 
  write.fst(
    emissions_rate_dt,
    here('Data/electricity-generation/plant-model-fit/emissions-rate-dt-raw.fst')
  )
  write.fst(
    clean_emissions_rate_dt,
    here('Data/electricity-generation/plant-model-fit/emissions-rate-dt.fst')
  )
  write.csv(
    clean_emissions_rate_dt,
    here('Data/electricity-generation/output/emissions-rate-median-oge.csv')
  )
  print('done.')
}

